SIMARG DATASET
!pip install imblearn
! pip install -U scikit-fuzzy
! pip install fuzzy-c-means
!pip install imblearn
!pip install fcmeans
Requirement already satisfied: imblearn in c:\users\lr999\anaconda3\lib\site-packages (0.0) Requirement already satisfied: imbalanced-learn in c:\users\lr999\anaconda3\lib\site-packages (from imblearn) (0.8.1) Requirement already satisfied: joblib>=0.11 in c:\users\lr999\anaconda3\lib\site-packages (from imbalanced-learn->imblearn) (0.17.0) Requirement already satisfied: scikit-learn>=0.24 in c:\users\lr999\anaconda3\lib\site-packages (from imbalanced-learn->imblearn) (1.0.1) Requirement already satisfied: numpy>=1.13.3 in c:\users\lr999\anaconda3\lib\site-packages (from imbalanced-learn->imblearn) (1.21.5) Requirement already satisfied: scipy>=0.19.1 in c:\users\lr999\anaconda3\lib\site-packages (from imbalanced-learn->imblearn) (1.5.2) Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\lr999\anaconda3\lib\site-packages (from scikit-learn>=0.24->imbalanced-learn->imblearn) (2.1.0) Requirement already up-to-date: scikit-fuzzy in c:\users\lr999\anaconda3\lib\site-packages (0.4.2) Requirement already satisfied, skipping upgrade: networkx>=1.9.0 in c:\users\lr999\anaconda3\lib\site-packages (from scikit-fuzzy) (2.5) Requirement already satisfied, skipping upgrade: numpy>=1.6.0 in c:\users\lr999\anaconda3\lib\site-packages (from scikit-fuzzy) (1.21.5) Requirement already satisfied, skipping upgrade: scipy>=0.9.0 in c:\users\lr999\anaconda3\lib\site-packages (from scikit-fuzzy) (1.5.2) Requirement already satisfied, skipping upgrade: decorator>=4.3.0 in c:\users\lr999\anaconda3\lib\site-packages (from networkx>=1.9.0->scikit-fuzzy) (4.4.2) Collecting fuzzy-c-means Using cached fuzzy_c_means-1.6.3-py3-none-any.whl (9.1 kB) Requirement already satisfied: tabulate<0.9.0,>=0.8.9 in c:\users\lr999\anaconda3\lib\site-packages (from fuzzy-c-means) (0.8.9) Collecting pydantic<2.0.0,>=1.8.2 Using cached pydantic-1.8.2-cp38-cp38-win_amd64.whl (2.0 MB) Requirement already satisfied: numpy<2.0.0,>=1.21.1 in c:\users\lr999\anaconda3\lib\site-packages (from 
fuzzy-c-means) (1.21.5) Collecting typer<0.4.0,>=0.3.2 Using cached typer-0.3.2-py3-none-any.whl (21 kB) Requirement already satisfied: typing-extensions>=3.7.4.3 in c:\users\lr999\anaconda3\lib\site-packages (from pydantic<2.0.0,>=1.8.2->fuzzy-c-means) (3.7.4.3) Requirement already satisfied: click<7.2.0,>=7.1.1 in c:\users\lr999\anaconda3\lib\site-packages (from typer<0.4.0,>=0.3.2->fuzzy-c-means) (7.1.2) Installing collected packages: pydantic, typer, fuzzy-c-means Successfully installed fuzzy-c-means-1.6.3 pydantic-1.8.2 typer-0.3.2 Requirement already satisfied: imblearn in c:\users\lr999\anaconda3\lib\site-packages (0.0) Requirement already satisfied: imbalanced-learn in c:\users\lr999\anaconda3\lib\site-packages (from imblearn) (0.8.1) Requirement already satisfied: scipy>=0.19.1 in c:\users\lr999\anaconda3\lib\site-packages (from imbalanced-learn->imblearn) (1.5.2) Requirement already satisfied: joblib>=0.11 in c:\users\lr999\anaconda3\lib\site-packages (from imbalanced-learn->imblearn) (0.17.0) Requirement already satisfied: scikit-learn>=0.24 in c:\users\lr999\anaconda3\lib\site-packages (from imbalanced-learn->imblearn) (1.0.1) Requirement already satisfied: numpy>=1.13.3 in c:\users\lr999\anaconda3\lib\site-packages (from imbalanced-learn->imblearn) (1.21.5) Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\lr999\anaconda3\lib\site-packages (from scikit-learn>=0.24->imbalanced-learn->imblearn) (2.1.0)
ERROR: Could not find a version that satisfies the requirement fcmeans (from versions: none) ERROR: No matching distribution found for fcmeans
%matplotlib inline
import fcmeans
from fcmeans import FCM
# example of anova f-test feature selection for numerical data
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from matplotlib import pyplot
from collections import Counter
from numpy.random import RandomState
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE
# IMPORT ALL NEEDED LIBRARIES
import numpy as np
import pandas as pd
import os
import time
import statistics
import seaborn as sns
# data visualization
import matplotlib.pyplot as plt
# methods for data preparation
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# evaluation metrics
from sklearn.metrics import (
accuracy_score,
precision_score,
recall_score,
confusion_matrix,
classification_report,
f1_score,
homogeneity_score,
completeness_score,
calinski_harabasz_score
)
# CV
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
# distribution probabilities
from scipy.stats import uniform, randint
# classifiers
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from google.colab import drive
drive.mount('/content/drive')
#4/1AX4XfWgzEvAFE4dbW27C6ErZveeX2u2ze2-_ZxaGPDZgMh4d1IE212CHWkg
path = "../AI4sec_LDG/SIMARGL2021_dataset-part1_cleaned.csv" #insert google drive database path
path_lore = '/content/drive/MyDrive/SIMARGL2021_dataset-part1_cleaned.csv'
df = pd.read_csv(path)
Adding this to skip data cleaning part
path = "/content/drive/MyDrive/AI4SEC_LDG/df_SMOTE_before.csv" #insert google drive database path
df = pd.read_csv(path)
Basic stats
• How many rows and how many columns are there in the data?
print("Number of rows: ",df.shape[0]) # PRINT THE NUMBER OF ROWS
print("Number of columns: ",df.shape[1]) # PRINT THE NUMBER OF COLUMNS
Number of rows: 2434337 Number of columns: 50
• What are the names and datatypes in each column?
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2434337 entries, 0 to 2434336 Data columns (total 50 columns): # Column Dtype --- ------ ----- 0 BIFLOW_DIRECTION int64 1 DIRECTION int64 2 DST_TO_SRC_SECOND_BYTES int64 3 FIREWALL_EVENT int64 4 FIRST_SWITCHED int64 5 FLOW_ACTIVE_TIMEOUT int64 6 FLOW_DURATION_MICROSECONDS int64 7 FLOW_DURATION_MILLISECONDS int64 8 FLOW_END_MILLISECONDS int64 9 FLOW_END_SEC int64 10 FLOW_ID int64 11 FLOW_INACTIVE_TIMEOUT int64 12 FLOW_START_MILLISECONDS int64 13 FLOW_START_SEC int64 14 FRAME_LENGTH int64 15 IN_BYTES int64 16 IN_PKTS int64 17 IPV4_DST_ADDR object 18 IPV4_SRC_ADDR object 19 L4_DST_PORT int64 20 L4_SRC_PORT int64 21 LAST_SWITCHED int64 22 MAX_IP_PKT_LEN int64 23 MIN_IP_PKT_LEN int64 24 OOORDER_IN_PKTS int64 25 OOORDER_OUT_PKTS int64 26 OUT_BYTES int64 27 OUT_PKTS int64 28 PROTOCOL int64 29 PROTOCOL_MAP object 30 RETRANSMITTED_IN_BYTES int64 31 RETRANSMITTED_IN_PKTS int64 32 RETRANSMITTED_OUT_BYTES int64 33 RETRANSMITTED_OUT_PKTS int64 34 SRC_TO_DST_SECOND_BYTES int64 35 TCP_FLAGS int64 36 TCP_WIN_MAX_IN int64 37 TCP_WIN_MAX_OUT int64 38 TCP_WIN_MIN_IN int64 39 TCP_WIN_MIN_OUT int64 40 TCP_WIN_MSS_IN int64 41 TCP_WIN_MSS_OUT int64 42 TCP_WIN_SCALE_IN int64 43 TCP_WIN_SCALE_OUT int64 44 SRC_TOS int64 45 DST_TOS int64 46 L7_PROTO_NAME object 47 SAMPLING_INTERVAL int64 48 TOTAL_FLOWS_EXP int64 49 LABEL object dtypes: int64(45), object(5) memory usage: 928.6+ MB
• What percentage of SYN Scan - aggressive?
print("Percentage of SYN Scan - aggressive: " + str(len(df[df['LABEL']=='SYN Scan - aggressive']) / df.shape[0] * 100) + "%")
Percentage of SYN Scan - aggressive: 75.53173615649764%
# Report the percentage of NaN values in each column.
# (Every column prints 0.0 — this dataset has no missing values.)
for column in df.columns:
    nan_count = df[column].isna().sum()
    print(nan_count / len(df[column]) * 100)
0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
As we can see, the label is not equally distributed, so we decided to use an oversampling technique to balance our data.
df['LABEL'].value_counts()
SYN Scan - aggressive 1838697 Normal flow 595640 Name: LABEL, dtype: int64
df.dtypes
BIFLOW_DIRECTION int64 DIRECTION int64 DST_TO_SRC_SECOND_BYTES int64 FIREWALL_EVENT int64 FIRST_SWITCHED int64 FLOW_ACTIVE_TIMEOUT int64 FLOW_DURATION_MICROSECONDS int64 FLOW_DURATION_MILLISECONDS int64 FLOW_END_MILLISECONDS int64 FLOW_END_SEC int64 FLOW_ID int64 FLOW_INACTIVE_TIMEOUT int64 FLOW_START_MILLISECONDS int64 FLOW_START_SEC int64 FRAME_LENGTH int64 IN_BYTES int64 IN_PKTS int64 IPV4_DST_ADDR object IPV4_SRC_ADDR object L4_DST_PORT int64 L4_SRC_PORT int64 LAST_SWITCHED int64 MAX_IP_PKT_LEN int64 MIN_IP_PKT_LEN int64 OOORDER_IN_PKTS int64 OOORDER_OUT_PKTS int64 OUT_BYTES int64 OUT_PKTS int64 PROTOCOL int64 PROTOCOL_MAP object RETRANSMITTED_IN_BYTES int64 RETRANSMITTED_IN_PKTS int64 RETRANSMITTED_OUT_BYTES int64 RETRANSMITTED_OUT_PKTS int64 SRC_TO_DST_SECOND_BYTES int64 TCP_FLAGS int64 TCP_WIN_MAX_IN int64 TCP_WIN_MAX_OUT int64 TCP_WIN_MIN_IN int64 TCP_WIN_MIN_OUT int64 TCP_WIN_MSS_IN int64 TCP_WIN_MSS_OUT int64 TCP_WIN_SCALE_IN int64 TCP_WIN_SCALE_OUT int64 SRC_TOS int64 DST_TOS int64 L7_PROTO_NAME object SAMPLING_INTERVAL int64 TOTAL_FLOWS_EXP int64 LABEL object dtype: object
With this simple command below we can analyze every possible value of every feature. We can notice that some features have a very high cardinality, sometimes even equal to the number of rows. Those features will be dropped because they won't give any useful information for our classification problem.
# Dump the full value distribution of every column, to spot constant
# columns and ID-like (all-unique) columns worth dropping later.
for col in df.columns:
    print(df[col].value_counts())
1 2434337
Name: BIFLOW_DIRECTION, dtype: int64
0 2434324
1 13
Name: DIRECTION, dtype: int64
40 1953162
104 32897
52 31730
76 10140
160 9968
...
9595 1
17783 1
43669 1
7550 1
9171 1
Name: DST_TO_SRC_SECOND_BYTES, Length: 10322, dtype: int64
0 2434337
Name: FIREWALL_EVENT, dtype: int64
1616660095 11379
1616661813 10687
1616672658 8463
1616669083 8326
1616665357 6442
...
1616672486 10
1616664975 9
1616664904 8
1616672544 8
1616664925 7
Name: FIRST_SWITCHED, Length: 14401, dtype: int64
120 2434337
Name: FLOW_ACTIVE_TIMEOUT, dtype: int64
778 3435
765 3413
774 3388
803 3371
783 3369
...
50972 1
115744 1
123940 1
103466 1
242570 1
Name: FLOW_DURATION_MICROSECONDS, Length: 147160, dtype: int64
0 1590879
1 427310
2 33494
28 31823
27 20906
...
946 1
944 1
924 1
939 1
987 1
Name: FLOW_DURATION_MILLISECONDS, Length: 972, dtype: int64
1616660095548 61
1616660095824 58
1616661813686 56
1616669083626 48
1616672658922 47
..
1616668136805 1
1616671012846 1
1616662603748 1
1616671020084 1
1616661307580 1
Name: FLOW_END_MILLISECONDS, Length: 1480441, dtype: int64
1616660095 11379
1616661813 10687
1616672658 8463
1616669083 8326
1616665357 6442
...
1616672486 10
1616664975 9
1616664904 8
1616672544 8
1616664925 7
Name: FLOW_END_SEC, Length: 14401, dtype: int64
4196351 1
4887550 1
4883452 1
4897787 1
4893689 1
..
4631939 1
4629890 1
4636033 1
4633984 1
4196353 1
Name: FLOW_ID, Length: 2434337, dtype: int64
30 2434337
Name: FLOW_INACTIVE_TIMEOUT, dtype: int64
1616661813685 95
1616669083625 57
1616665357890 56
1616660095823 55
1616665358285 53
..
1616671332204 1
1616667094899 1
1616671928647 1
1616671295358 1
1616661311678 1
Name: FLOW_START_MILLISECONDS, Length: 1339460, dtype: int64
1616660095 11379
1616661813 10687
1616672658 8463
1616669083 8326
1616665357 6442
...
1616672486 10
1616664975 9
1616664904 8
1616672544 8
1616664925 7
Name: FLOW_START_SEC, Length: 14401, dtype: int64
0 2434337
Name: FRAME_LENGTH, dtype: int64
44 1841838
189 56883
52 38887
72 34438
41 29230
...
11059 1
4919 1
4918 1
2869 1
8951 1
Name: IN_BYTES, Length: 7278, dtype: int64
1 2175712
2 173107
3 16854
4 10265
5 9312
...
247 1
177 1
244 1
241 1
1715 1
Name: IN_PKTS, Length: 279, dtype: int64
10.114.226.5 154807
10.114.224.65 112049
10.114.224.101 69978
10.114.224.106 67697
10.114.224.102 67574
...
80.231.63.245 1
90.45.8.249 1
46.223.163.160 1
173.245.59.169 1
34.252.7.224 1
Name: IPV4_DST_ADDR, Length: 11138, dtype: int64
10.114.241.166 1838697
10.114.226.5 85263
10.114.225.202 52751
10.114.225.212 44785
10.114.241.165 44086
...
173.194.187.134 1
74.125.190.137 1
104.244.42.195 1
149.129.246.73 1
66.249.93.116 1
Name: IPV4_SRC_ADDR, Length: 9953, dtype: int64
53 308066
443 137621
8291 14224
8728 14121
11319 9209
...
24490 25
41780 24
35909 24
9697 23
0 12
Name: L4_DST_PORT, Length: 65536, dtype: int64
49726 654755
49214 392896
49470 392890
48958 261989
49982 130974
...
32077 1
15339 1
9192 1
19787 1
5706 1
Name: L4_SRC_PORT, Length: 55450, dtype: int64
1616660095 11379
1616661813 10687
1616672658 8463
1616669083 8326
1616665357 6442
...
1616672486 10
1616664975 9
1616664904 8
1616672544 8
1616664925 7
Name: LAST_SWITCHED, Length: 14401, dtype: int64
0 2434337
Name: MAX_IP_PKT_LEN, dtype: int64
0 2434337
Name: MIN_IP_PKT_LEN, dtype: int64
0 2434317
1 17
3 2
2 1
Name: OOORDER_IN_PKTS, dtype: int64
0 2434279
1 58
Name: OOORDER_OUT_PKTS, dtype: int64
40 1953162
104 32897
52 31730
76 10140
160 9968
...
9595 1
17783 1
43669 1
7550 1
9171 1
Name: OUT_BYTES, Length: 10322, dtype: int64
1 2241379
2 125329
4 10099
9 7185
5 6841
...
338 1
341 1
342 1
346 1
293 1
Name: OUT_PKTS, Length: 352, dtype: int64
6 2114931
17 319406
Name: PROTOCOL, dtype: int64
tcp 2114931
udp 319406
Name: PROTOCOL_MAP, dtype: int64
0 2427027
40 4829
153 1577
80 284
44 37
...
462 1
520 1
569 1
60060 1
336 1
Name: RETRANSMITTED_IN_BYTES, Length: 135, dtype: int64
0 2427027
1 6554
2 335
3 27
32 25
28 25
27 23
24 22
26 20
25 19
23 16
29 15
22 14
33 14
20 14
30 13
18 12
34 12
4 11
31 11
11 11
5 10
21 10
10 10
16 9
14 8
12 7
17 7
13 6
19 6
7 6
15 6
8 5
9 4
36 4
35 4
39 2
6 2
37 2
101 1
44 1
42 1
40 1
38 1
45 1
57 1
75 1
108 1
Name: RETRANSMITTED_IN_PKTS, dtype: int64
0 2427243
14300 1249
17160 1075
15730 811
12870 548
...
806 1
9000 1
823 1
824 1
655 1
Name: RETRANSMITTED_OUT_BYTES, Length: 405, dtype: int64
0 2427243
10 1253
12 1079
11 814
1 735
...
68 1
62 1
58 1
50 1
480 1
Name: RETRANSMITTED_OUT_PKTS, Length: 83, dtype: int64
44 1841838
189 56883
52 38887
72 34438
41 29230
...
11059 1
4919 1
4918 1
2869 1
8951 1
Name: SRC_TO_DST_SECOND_BYTES, Length: 7278, dtype: int64
22 1911862
0 319406
25 36990
16 36801
24 34217
28 29672
26 22171
27 15608
31 7633
29 7178
19 5085
17 3476
23 1603
30 1117
20 568
21 357
219 227
218 223
214 51
18 35
223 32
54 17
222 5
211 1
157 1
153 1
Name: TCP_FLAGS, dtype: int64
1024 1844196
0 319514
64240 42189
8192 33326
118 25258
...
937 1
11181 1
22305 1
19361 1
1001 1
Name: TCP_WIN_MAX_IN, Length: 5987, dtype: int64
0 2264103
65535 18171
27960 11343
502 9355
64240 7058
...
2699 1
6794 1
6795 1
1525 1
7765 1
Name: TCP_WIN_MAX_OUT, Length: 5202, dtype: int64
1024 1856429
0 319514
64240 42168
8192 37347
118 25310
...
64598 1
3144 1
21571 1
37978 1
3073 1
Name: TCP_WIN_MIN_IN, Length: 6076, dtype: int64
0 2264103
65535 18136
27960 11343
502 9480
64240 7056
...
2550 1
3825 1
63216 1
5121 1
7985 1
Name: TCP_WIN_MIN_OUT, Length: 5089, dtype: int64
1460 1896266
0 487431
1452 30359
1390 10851
1412 3256
1440 1616
1410 1101
1420 964
1400 644
1360 267
1380 218
536 211
1414 137
265 127
1300 105
1424 100
1416 84
1370 72
1200 71
1382 40
1260 34
1220 32
1448 31
1436 27
1398 27
1208 25
1459 20
1356 20
1318 19
1456 16
1422 16
1350 15
1340 15
1104 14
1434 11
1310 10
1357 9
1464 9
1320 8
1446 8
1103 8
1324 7
1270 5
8960 3
1304 3
1210 3
1146 3
1480 2
1450 2
1256 2
1444 2
1326 2
1402 2
1430 1
1431 1
1428 1
1408 1
1388 1
1328 1
1938 1
Name: TCP_WIN_MSS_IN, dtype: int64
0 2380219
1460 24259
1410 11718
1440 6895
1430 4991
1400 3049
1420 1031
1392 658
1450 607
1380 162
1436 139
8960 139
1412 86
1480 85
1452 53
8961 45
8860 43
1360 38
1300 26
1456 26
1408 17
1418 15
1432 10
1390 8
960 4
1182 2
1448 2
1240 2
65475 2
1220 1
1330 1
1372 1
8880 1
1414 1
4430 1
Name: TCP_WIN_MSS_OUT, dtype: int64
0 2341696
8 33647
7 31961
2 24781
6 1284
9 348
10 278
12 88
3 73
4 65
5 59
1 28
14 15
11 14
Name: TCP_WIN_SCALE_IN, dtype: int64
0 2382049
8 26215
7 18110
10 3683
9 2971
6 421
12 287
2 244
5 195
4 102
11 36
3 18
13 5
14 1
Name: TCP_WIN_SCALE_OUT, dtype: int64
0 2357375
72 30021
164 24097
40 14413
16 4012
184 1177
32 1159
2 975
64 218
24 166
104 107
42 105
36 76
136 62
96 59
192 54
128 51
166 41
8 25
224 24
208 23
20 21
80 15
4 10
43 8
56 7
112 5
44 2
19 2
52 1
63 1
73 1
25 1
22 1
253 1
74 1
152 1
206 1
191 1
186 1
175 1
173 1
172 1
165 1
161 1
148 1
81 1
140 1
139 1
121 1
114 1
107 1
103 1
230 1
86 1
100 1
Name: SRC_TOS, dtype: int64
0 2337758
164 39278
72 32818
40 17884
184 1770
...
103 1
100 1
180 1
81 1
253 1
Name: DST_TOS, Length: 63, dtype: int64
Unknown 1902143
DNS 159712
TLS 113836
DNS.Google 75403
DNS.Microsoft 19429
...
MySQL.Amazon 1
TLS.Whois-DAS 1
MySQL.LDAP 1
QUIC.Snapchat 1
TLS.ApplePush 1
Name: L7_PROTO_NAME, Length: 281, dtype: int64
1 2434337
Name: SAMPLING_INTERVAL, dtype: int64
4196351 1
4887550 1
4883452 1
4897787 1
4893689 1
..
4631939 1
4629890 1
4636033 1
4633984 1
4196353 1
Name: TOTAL_FLOWS_EXP, Length: 2434337, dtype: int64
SYN Scan - aggressive 1838697
Normal flow 595640
Name: LABEL, dtype: int64
df['LABEL'].value_counts()
SYN Scan - aggressive 1838697 Normal flow 595640 Name: LABEL, dtype: int64
The 2 possible values that can assume our LABEL feature are Normal Flow and SYN Scan - aggressive. The first one describes a normal connection, the second in some ways identifies a potential dangerous connection.
In our primary analysis can be useful to separate the malicious connections from the normal ones.
# Separate malicious flows from benign ones for side-by-side analysis.
df_bad = df[df['LABEL']=='SYN Scan - aggressive']  # malicious connections
df_good = df[df['LABEL']=='Normal flow']  # normal connections
As the label is not balanced, we will analyze a portion of the rows equivalent to the number of Normal flows (approximately 500k).
The features below can, at first sight, represent possible good discriminators between the two classes.
# Candidate discriminative features: compare mean/median per class to see
# whether their distributions differ between malicious and normal flows.
possible_col = ['FLOW_DURATION_MICROSECONDS',
'OOORDER_OUT_PKTS',
'RETRANSMITTED_IN_PKTS', 'TCP_WIN_MAX_IN', 'IN_PKTS', 'IN_BYTES']
for col in possible_col:
print(col)
print('Mean for bad: ', df_bad[col].mean(), 'Median for bad: ', df_bad[col].median())
print('Mean for good: ', df_good[col].mean(), 'Median for good: ', df_good[col].median())
FLOW_DURATION_MICROSECONDS Mean for bad: 948.3793969316315 Median for bad: 815.0 Mean for good: 57774.47330266604 Median for good: 27738.0 OOORDER_OUT_PKTS Mean for bad: 0.0 Median for bad: 0.0 Mean for good: 9.737425290443892e-05 Median for good: 0.0 RETRANSMITTED_IN_PKTS Mean for bad: 2.610544314805539e-05 Median for bad: 0.0 Mean for good: 0.027686186287019004 Median for good: 0.0 TCP_WIN_MAX_IN Mean for bad: 1061.4409845667883 Median for bad: 1024.0 Mean for good: 8680.02401786314 Median for good: 0.0 IN_PKTS Mean for bad: 1.0031342847679634 Median for bad: 1.0 Mean for good: 2.6516771875629574 Median for good: 1.0 IN_BYTES Mean for bad: 44.30960838028234 Median for bad: 44.0 Mean for good: 444.296719494997 Median for good: 93.0
We can see how the flow duration differs between the two groups. It seems that normal connections last much longer than the malicious ones.
Also the TCP max window size is way larger in normal connections.
IN_BYTES represents the number of incoming bytes from the connection. In the good connections the number of bytes is ten times that of malicious one.
# Find the columns that take exactly two distinct values and show
# how those two values are distributed.
binary_columns = [name for name in df.columns if df[name].nunique() == 2]
for name in binary_columns:
    print(df[name].value_counts())
0 2434324 1 13 Name: DIRECTION, dtype: int64 0 2434279 1 58 Name: OOORDER_OUT_PKTS, dtype: int64 6 2114931 17 319406 Name: PROTOCOL, dtype: int64 tcp 2114931 udp 319406 Name: PROTOCOL_MAP, dtype: int64 SYN Scan - aggressive 1838697 Normal flow 595640 Name: LABEL, dtype: int64
An interesting analysis can be made on the protocol map feature, which identifies the protocol used (tcp or udp)
# Count flows per protocol within each class; malicious traffic turns out
# to be TCP-only, so the protocol is a strong discriminant.
for label, frame in (('malicious', df_bad), ('normal', df_good)):
    for proto in ('udp', 'tcp'):
        count = len(frame[frame['PROTOCOL_MAP'] == proto])
        print(f'Number of connections using {proto} listed as {label}: ', count)
Number of connections using udp listed as malicious: 0 Number of connections using tcp listed as malicious: 1838697 Number of connections using udp listed as normal: 319406 Number of connections using tcp listed as normal: 276234
No UDP using connection is listed as malicious, we can assume this feature as a discriminant for a classification problem.
Malicious connections pass by TCP protocol.
LOW-HIGH VARIABILITY: Looking at the dataset we noticed that some categorical features were composed mainly of just one category, so we decided to delete the columns where the frequency of a single category was more than 99% (covering almost all the values of the column). We also removed columns where the most frequent category appeared in less than 0.0001% of the rows, because that meant the value was almost different for every row (so we considered it a meaningless "ID" column).
# Drop near-constant columns (top value covers > 99% of rows) and
# ID-like columns (top value covers < 0.0001% of rows).
droppable = []
for col in df.columns:
    counts = df[col].value_counts()
    # value_counts is sorted descending, so iloc[0] is the modal count.
    top_share = counts.iloc[0] / df.shape[0] * 100
    if top_share > 99 or top_share < 0.0001:
        droppable.append(col)
df_dropped = df.drop(columns=droppable, axis=1)
df_dropped.shape
df = df_dropped
# Partition the remaining features into categorical, numerical and
# binary groups.
df_drop = df  # working alias (note: not a copy)
categorical_columns = [
    'IPV4_DST_ADDR',    # destination IP address
    'IPV4_SRC_ADDR',    # source IP address
    'L4_DST_PORT',      # IPv4 destination port number
    'L4_SRC_PORT',      # IPv4 source port number
    'PROTOCOL',         # IP protocol byte (id) -- binary here
    'PROTOCOL_MAP',     # IP protocol name -- binary here
    'L7_PROTO_NAME',    # application-layer protocol name
    'LABEL',            # target class -- binary
]
df_drop = df_drop.drop(columns=categorical_columns)
# Everything that is not categorical is treated as numerical.
numerical_columns = list(df_drop.columns)
# Categorical columns with at most two distinct values.
binary_columns = [name for name in categorical_columns if df[name].nunique() <= 2]
def freq_enc(df, list_cat):
    """Frequency-encode the given columns of *df* in place.

    Every categorical value is replaced by the number of times it occurs
    in its column, so more common categories receive larger codes.
    Mutates *df* and also returns it for convenience.
    """
    for column in list_cat:
        frequencies = df[column].value_counts().to_dict()
        df[column] = df[column].map(frequencies)
    return df
# Replace every categorical value (including the LABEL) with its frequency.
df = freq_enc(df, categorical_columns)
# Separate the feature matrix from the target label.
X = df.drop(columns=["LABEL"])
y = df["LABEL"]
print(X.shape)
(2434337, 32)
The algorithm used for oversampling is SMOTE.
SMOTE is an oversampling technique where the synthetic samples are generated for the minority class. It focuses on the feature space to generate new instances with the help of interpolation between the positive instances that lie together.
# Balance the two classes by synthesizing minority-class samples (SMOTE).
oversample = SMOTE()
X, y = oversample.fit_resample(X, y)
# Re-attach the resampled labels so df holds the balanced dataset again.
X["LABEL"] = y
print(X.shape)
df = X
(3677394, 33)
# Min-max scale every column to [0, 1]; fit_transform refits the scaler
# from scratch for each column group, so reusing one object is safe.
# NOTE(review): categorical_columns still contains 'LABEL', so the encoded
# label is scaled here too — confirm this is intended.
scaler = MinMaxScaler()
# transform data
df[categorical_columns] = scaler.fit_transform(df[categorical_columns])
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])
In the next cell it's performed a multivariate outlier detection with isolation forest. Once detected, the outliers have been removed.
Isolation forest works similar to a Random Forest Classifier. It uses contamination ratio as Threshold for considering point anomalous.
After finding the possible outliers, we discarded them from our dataset.
# Multivariate outlier detection on the numerical features.
# .copy() makes df_num an independent frame: assigning the 'anomaly'
# column below on a plain slice raised SettingWithCopyWarning in the
# original run.
df_num = df[numerical_columns].copy()
from sklearn.ensemble import IsolationForest
# contamination=0.12: threshold assuming ~12% of the points are anomalous.
clf = IsolationForest(n_estimators=100, max_samples='auto', contamination=0.12,
                      max_features=1.0, bootstrap=False, n_jobs=-1, random_state=42, verbose=0)
pred = clf.fit_predict(df_num)  # -1 = anomaly, 1 = normal
df_num['anomaly'] = pred
outliers = df_num.loc[df_num['anomaly'] == -1]
outlier_index = list(outliers.index)
# Print only the count: dumping the full list (~440k indices) exceeded the
# notebook's IOPub data-rate limit in the original run.
print(len(outlier_index))
# Points classified -1 are anomalous; show the anomaly/normal breakdown.
print(df_num['anomaly'].value_counts())
C:\Users\lr999\anaconda3\lib\site-packages\sklearn\base.py:445: UserWarning: X does not have valid feature names, but IsolationForest was fitted with feature names warnings.warn( <ipython-input-29-7b33c5052cfd>:11: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_num['anomaly']=pred IOPub data rate exceeded. The notebook server will temporarily stop sending output to the client in order to avoid crashing it. To change this limit, set the config variable `--NotebookApp.iopub_data_rate_limit`. Current values: NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec) NotebookApp.rate_limit_window=3.0 (secs)
print(df_num['anomaly'].value_counts())
1 3236242 -1 441152 Name: anomaly, dtype: int64
# Attach the anomaly flags to the full dataframe, keep only the inliers
# (anomaly == 1), then drop the helper column.
df['anomaly'] = df_num['anomaly']
df = df[df['anomaly'] == 1]
df = df.drop(columns='anomaly')
In this section we detect the most important features through the chi2 criterion. We keep the features whose importance is greater than the median of all the feature importances.
def X_y_tts(df):
    """Split *df* into stratified train/test feature and label sets.

    Returns (X_train, X_test, y_train, y_test) with a 70/30 split,
    shuffled and stratified on LABEL, with a fixed random seed.
    """
    features = df.drop(columns=["LABEL"])
    labels = df["LABEL"]
    return train_test_split(
        features, labels,
        test_size=0.3, shuffle=True, stratify=labels, random_state=42,
    )
X_train, X_test, y_train, y_test = X_y_tts(df)
from sklearn.feature_selection import SelectKBest, chi2
def select_features(X_train, y_train, X_test):
    """Score every feature with the chi2 statistic.

    Fits a SelectKBest selector (keeping all features, so this only
    computes scores) on the training data and returns the transformed
    train set, the transformed test set, and the fitted selector.
    """
    selector = SelectKBest(score_func=chi2, k='all')
    selector.fit(X_train, y_train)
    return selector.transform(X_train), selector.transform(X_test), selector
X_train_fs, X_test_fs, fs = select_features(X_train, y_train, X_test)
# Map each feature name to its chi2 score and print them, then show
# the scores as a bar chart.
dict_ = {}
for feature, score in zip(X_train.columns, fs.scores_):
    dict_[feature] = score
    print('Feature ', feature, ': ', score)
# plot the scores
pyplot.bar(range(len(fs.scores_)), fs.scores_)
pyplot.show()
Feature DST_TO_SRC_SECOND_BYTES : 5.579787958406662 Feature FIRST_SWITCHED : 1135.9731052646669 Feature FLOW_DURATION_MICROSECONDS : 30438.853293367272 Feature FLOW_DURATION_MILLISECONDS : 31923.62069296834 Feature FLOW_END_MILLISECONDS : 1137.3843953524422 Feature FLOW_END_SEC : 1135.9731052646669 Feature FLOW_START_MILLISECONDS : 1137.2645896984525 Feature FLOW_START_SEC : 1135.9731052646669 Feature IN_BYTES : 26.812517833407426 Feature IN_PKTS : 144.21869799824364 Feature IPV4_DST_ADDR : 478.9652304643299 Feature IPV4_SRC_ADDR : 928757.8433240075 Feature L4_DST_PORT : 921922.8634034401 Feature L4_SRC_PORT : 656550.1751871831 Feature LAST_SWITCHED : 1135.9731052646669 Feature OUT_BYTES : 5.579787958406662 Feature OUT_PKTS : 17.569907486853165 Feature PROTOCOL : 347909.8766395627 Feature PROTOCOL_MAP : 347909.8766395627 Feature SRC_TO_DST_SECOND_BYTES : 26.812517833407426 Feature TCP_FLAGS : 32896.90697898307 Feature TCP_WIN_MAX_IN : 304.7049820047532 Feature TCP_WIN_MAX_OUT : 4571.136558574499 Feature TCP_WIN_MIN_IN : 298.61285441955215 Feature TCP_WIN_MIN_OUT : 4571.049621767727 Feature TCP_WIN_MSS_IN : 128946.28329771661 Feature TCP_WIN_MSS_OUT : 0.33843529689372476 Feature TCP_WIN_SCALE_IN : 9836.046588057001 Feature TCP_WIN_SCALE_OUT : 2.0643446738141775 Feature SRC_TOS : 39557.7941917804 Feature DST_TOS : 62653.43226690948 Feature L7_PROTO_NAME : 674971.7934155122
Here we check only for the features which have an importance greater than the median of all feature importances. Once we get the list of those more important features we create a dataframe based only on those features.
# Keep only the features whose chi2 score exceeds the median score.
threshold = np.percentile(fs.scores_, 50)
l = [feature for feature, score in dict_.items() if score > threshold]
print(len(l))
16
l.append("LABEL")
df = df[l]
df.to_csv('df_SMOTE_before.csv',index=False)
result_class = pd.DataFrame(index = ['Accuracy', 'Precision', 'Recall', 'f1_score'])
Adaboost Classifier
The most important parameters are base_estimator, n_estimators and learning_rate.
base_estimator is the learning algorithm used to train the weak models. It will almost never need to be changed, because by far the most common learner to use with AdaBoost is a decision tree — this parameter's default.
n_estimators is the number of models to iteratively train.
learning_rate is the contribution of each model to the weights and defaults to 1. Reducing the learning rate will mean the weights will be increased or decreased to a small degree, forcing the model train slower (but sometimes resulting in better performance scores).
loss is exclusive to AdaBoostRegressor and sets the loss function to use when updating weights. This defaults to a linear loss function however can be changed to square or exponential.
The advantages are as follows:
AdaBoost is easy to implement.
The disadvantages are as follows:
AdaBoost is sensitive to noise data.
It is highly affected by outliers because it tries to fit each point perfectly.
AdaBoost is slower compared to XGBoost.
# Train and evaluate an AdaBoost classifier on the selected features.
X = df.loc[:, ~df.columns.isin(['LABEL'])]
y = df['LABEL']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True, stratify=y, random_state=42)
# Create adaboost classifier object (default base estimator: decision stump)
abc = AdaBoostClassifier(n_estimators=50, learning_rate=1, random_state=42, algorithm='SAMME.R')
# Train Adaboost Classifier
model1 = abc.fit(X_train, y_train)
# Predict the response for test dataset
y_pred = model1.predict(X_test)
# BUG FIX: sklearn metric functions take (y_true, y_pred). The original
# passed (y_pred, y_test) — accuracy and f1 are symmetric so they were
# unaffected, but precision and recall were silently swapped.
# NOTE(review): precision/recall/f1 assume binary labels with pos_label=1
# present in y — confirm the encoded LABEL values satisfy this.
result_class['AdaBoostClassifier'] = [accuracy_score(y_test, y_pred),
                                      precision_score(y_test, y_pred),
                                      recall_score(y_test, y_pred),
                                      f1_score(y_test, y_pred),
                                      ]
# Re-split the data (same seed as the other model cells for comparability).
X = df.loc[:, ~df.columns.isin(['LABEL'])]
y = df['LABEL']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=True, stratify=y, random_state=42)
# Fit a random forest with default hyper-parameters.
forest = RandomForestClassifier(random_state=42)
forest.fit(X_train, y_train)
y_pred = forest.predict(X_test)
# FIX: sklearn metrics take (y_true, y_pred); the original call order was
# swapped, exchanging precision and recall. The stray
# `RandomForestClassifier(random_state=42)` echo line (a dead expression
# statement pasted from notebook output) was removed.
result_class['RandomForest'] = [accuracy_score(y_test, y_pred),
                                precision_score(y_test, y_pred),
                                recall_score(y_test, y_pred),
                                f1_score(y_test, y_pred)]
X = df.loc[:, ~df.columns.isin(['LABEL'])]
y = df['LABEL']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=True, stratify=y, random_state=42)
# use_label_encoder=False and an explicit eval_metric silence the two
# deprecation warnings this cell emitted (labels are already 0/1);
# 'logloss' is exactly the new default for binary:logistic, so fitted
# predictions are unchanged.
xgb = XGBClassifier(max_depth=5, n_estimators=50, random_state=42,
                    use_label_encoder=False, eval_metric='logloss')
xgb.fit(X_train, y_train)
C:\Users\lr999\anaconda3\lib\site-packages\xgboost\sklearn.py:1224: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1]. warnings.warn(label_encoder_deprecation_msg, UserWarning)
[09:45:09] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.5.1/src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
gamma=0, gpu_id=-1, importance_type=None,
interaction_constraints='', learning_rate=0.300000012,
max_delta_step=0, max_depth=5, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=50, n_jobs=12,
num_parallel_tree=1, predictor='auto', random_state=42,
reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
tree_method='exact', validate_parameters=1, verbosity=None)
y_pred = xgb.predict(X_test)
# FIX: sklearn metrics take (y_true, y_pred); the original passed them
# swapped, which exchanges precision and recall.
result_class['XGBoost'] = [accuracy_score(y_test, y_pred),
                           precision_score(y_test, y_pred),
                           recall_score(y_test, y_pred),
                           f1_score(y_test, y_pred)]
# Hyper-parameter search for a multi-layer perceptron: 3-fold CV over
# 3 layer shapes x 2 activations x 2 solvers x 2 alphas x 2 LR schedules
# = 48 combinations, using all cores.
mlp = MLPClassifier(max_iter=300)
parameter_space = {
    'hidden_layer_sizes': [(50,50,50), (50,100,50), (150, 100, 70)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant','adaptive'],
}
clf = GridSearchCV(mlp, parameter_space, n_jobs=-1, cv=3)
clf.fit(X_train, y_train)
GridSearchCV(cv=3, estimator=MLPClassifier(max_iter=300), n_jobs=-1,
param_grid={'activation': ['tanh', 'relu'],
'alpha': [0.0001, 0.05],
'hidden_layer_sizes': [(50, 50, 50), (50, 100, 50),
(150, 100, 70)],
'learning_rate': ['constant', 'adaptive'],
'solver': ['sgd', 'adam']})
# Best parameter set found by the grid search
print('Best parameters found:\n', clf.best_params_)
print('\n')
# Mean/std cross-validation score for every parameter combination tried
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    # +/- two standard deviations approximates a 95% interval
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
print('\n')
Best parameters found:
{'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 50, 50), 'learning_rate': 'constant', 'solver': 'sgd'}
1.000 (+/-0.000) for {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 50, 50), 'learning_rate': 'constant', 'solver': 'sgd'}
1.000 (+/-0.000) for {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 50, 50), 'learning_rate': 'constant', 'solver': 'adam'}
1.000 (+/-0.000) for {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 50, 50), 'learning_rate': 'adaptive', 'solver': 'sgd'}
1.000 (+/-0.000) for {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 50, 50), 'learning_rate': 'adaptive', 'solver': 'adam'}
1.000 (+/-0.000) for {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 100, 50), 'learning_rate': 'constant', 'solver': 'sgd'}
1.000 (+/-0.000) for {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 100, 50), 'learning_rate': 'constant', 'solver': 'adam'}
1.000 (+/-0.000) for {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 100, 50), 'learning_rate': 'adaptive', 'solver': 'sgd'}
1.000 (+/-0.000) for {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 100, 50), 'learning_rate': 'adaptive', 'solver': 'adam'}
1.000 (+/-0.000) for {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (150, 100, 70), 'learning_rate': 'constant', 'solver': 'sgd'}
1.000 (+/-0.000) for {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (150, 100, 70), 'learning_rate': 'constant', 'solver': 'adam'}
1.000 (+/-0.000) for {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (150, 100, 70), 'learning_rate': 'adaptive', 'solver': 'sgd'}
1.000 (+/-0.000) for {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (150, 100, 70), 'learning_rate': 'adaptive', 'solver': 'adam'}
1.000 (+/-0.000) for {'activation': 'tanh', 'alpha': 0.05, 'hidden_layer_sizes': (50, 50, 50), 'learning_rate': 'constant', 'solver': 'sgd'}
1.000 (+/-0.000) for {'activation': 'tanh', 'alpha': 0.05, 'hidden_layer_sizes': (50, 50, 50), 'learning_rate': 'constant', 'solver': 'adam'}
1.000 (+/-0.000) for {'activation': 'tanh', 'alpha': 0.05, 'hidden_layer_sizes': (50, 50, 50), 'learning_rate': 'adaptive', 'solver': 'sgd'}
1.000 (+/-0.000) for {'activation': 'tanh', 'alpha': 0.05, 'hidden_layer_sizes': (50, 50, 50), 'learning_rate': 'adaptive', 'solver': 'adam'}
1.000 (+/-0.000) for {'activation': 'tanh', 'alpha': 0.05, 'hidden_layer_sizes': (50, 100, 50), 'learning_rate': 'constant', 'solver': 'sgd'}
1.000 (+/-0.000) for {'activation': 'tanh', 'alpha': 0.05, 'hidden_layer_sizes': (50, 100, 50), 'learning_rate': 'constant', 'solver': 'adam'}
1.000 (+/-0.000) for {'activation': 'tanh', 'alpha': 0.05, 'hidden_layer_sizes': (50, 100, 50), 'learning_rate': 'adaptive', 'solver': 'sgd'}
1.000 (+/-0.000) for {'activation': 'tanh', 'alpha': 0.05, 'hidden_layer_sizes': (50, 100, 50), 'learning_rate': 'adaptive', 'solver': 'adam'}
1.000 (+/-0.000) for {'activation': 'tanh', 'alpha': 0.05, 'hidden_layer_sizes': (150, 100, 70), 'learning_rate': 'constant', 'solver': 'sgd'}
1.000 (+/-0.000) for {'activation': 'tanh', 'alpha': 0.05, 'hidden_layer_sizes': (150, 100, 70), 'learning_rate': 'constant', 'solver': 'adam'}
1.000 (+/-0.000) for {'activation': 'tanh', 'alpha': 0.05, 'hidden_layer_sizes': (150, 100, 70), 'learning_rate': 'adaptive', 'solver': 'sgd'}
1.000 (+/-0.000) for {'activation': 'tanh', 'alpha': 0.05, 'hidden_layer_sizes': (150, 100, 70), 'learning_rate': 'adaptive', 'solver': 'adam'}
1.000 (+/-0.000) for {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 50, 50), 'learning_rate': 'constant', 'solver': 'sgd'}
1.000 (+/-0.000) for {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 50, 50), 'learning_rate': 'constant', 'solver': 'adam'}
1.000 (+/-0.000) for {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 50, 50), 'learning_rate': 'adaptive', 'solver': 'sgd'}
1.000 (+/-0.000) for {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 50, 50), 'learning_rate': 'adaptive', 'solver': 'adam'}
1.000 (+/-0.000) for {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 100, 50), 'learning_rate': 'constant', 'solver': 'sgd'}
1.000 (+/-0.000) for {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 100, 50), 'learning_rate': 'constant', 'solver': 'adam'}
1.000 (+/-0.000) for {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 100, 50), 'learning_rate': 'adaptive', 'solver': 'sgd'}
1.000 (+/-0.000) for {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 100, 50), 'learning_rate': 'adaptive', 'solver': 'adam'}
1.000 (+/-0.000) for {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (150, 100, 70), 'learning_rate': 'constant', 'solver': 'sgd'}
1.000 (+/-0.000) for {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (150, 100, 70), 'learning_rate': 'constant', 'solver': 'adam'}
1.000 (+/-0.000) for {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (150, 100, 70), 'learning_rate': 'adaptive', 'solver': 'sgd'}
1.000 (+/-0.000) for {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (150, 100, 70), 'learning_rate': 'adaptive', 'solver': 'adam'}
1.000 (+/-0.000) for {'activation': 'relu', 'alpha': 0.05, 'hidden_layer_sizes': (50, 50, 50), 'learning_rate': 'constant', 'solver': 'sgd'}
1.000 (+/-0.000) for {'activation': 'relu', 'alpha': 0.05, 'hidden_layer_sizes': (50, 50, 50), 'learning_rate': 'constant', 'solver': 'adam'}
1.000 (+/-0.000) for {'activation': 'relu', 'alpha': 0.05, 'hidden_layer_sizes': (50, 50, 50), 'learning_rate': 'adaptive', 'solver': 'sgd'}
1.000 (+/-0.000) for {'activation': 'relu', 'alpha': 0.05, 'hidden_layer_sizes': (50, 50, 50), 'learning_rate': 'adaptive', 'solver': 'adam'}
1.000 (+/-0.000) for {'activation': 'relu', 'alpha': 0.05, 'hidden_layer_sizes': (50, 100, 50), 'learning_rate': 'constant', 'solver': 'sgd'}
1.000 (+/-0.000) for {'activation': 'relu', 'alpha': 0.05, 'hidden_layer_sizes': (50, 100, 50), 'learning_rate': 'constant', 'solver': 'adam'}
1.000 (+/-0.000) for {'activation': 'relu', 'alpha': 0.05, 'hidden_layer_sizes': (50, 100, 50), 'learning_rate': 'adaptive', 'solver': 'sgd'}
1.000 (+/-0.000) for {'activation': 'relu', 'alpha': 0.05, 'hidden_layer_sizes': (50, 100, 50), 'learning_rate': 'adaptive', 'solver': 'adam'}
1.000 (+/-0.000) for {'activation': 'relu', 'alpha': 0.05, 'hidden_layer_sizes': (150, 100, 70), 'learning_rate': 'constant', 'solver': 'sgd'}
1.000 (+/-0.000) for {'activation': 'relu', 'alpha': 0.05, 'hidden_layer_sizes': (150, 100, 70), 'learning_rate': 'constant', 'solver': 'adam'}
1.000 (+/-0.000) for {'activation': 'relu', 'alpha': 0.05, 'hidden_layer_sizes': (150, 100, 70), 'learning_rate': 'adaptive', 'solver': 'sgd'}
1.000 (+/-0.000) for {'activation': 'relu', 'alpha': 0.05, 'hidden_layer_sizes': (150, 100, 70), 'learning_rate': 'adaptive', 'solver': 'adam'}
# Evaluate the tuned grid-search estimator on the held-out test split.
y_true, y_pred = y_test , clf.predict(X_test)
from sklearn.metrics import classification_report
print('Results on the test set:')
print(classification_report(y_true, y_pred))
Results on the test set:
precision recall f1-score support
0.0 1.00 1.00 1.00 699321
1.0 1.00 1.00 1.00 918734
accuracy 1.00 1618055
macro avg 1.00 1.00 1.00 1618055
weighted avg 1.00 1.00 1.00 1618055
display(result_class.T)
| Accuracy | Precision | Recall | f1_score | |
|---|---|---|---|---|
| AdaBoostClassifier | 1.0 | 1.0 | 1.0 | 1.0 |
| RandomForest | 1.0 | 1.0 | 1.0 | 1.0 |
| XGBoost | 1.0 | 1.0 | 1.0 | 1.0 |
Comments:
Our models achieve near-perfect scores. Even algorithms that are robust to overfitting, such as Random Forest, perform extremely well, so we can rule out that type of error and take these results as reliable.
As our classification problem concerns Intrusion Detection, our goal is to identify any dangerous connection as fast as possible. Hence we suggest using the fastest model available, so that it performs well even over short time intervals.
!pip install dash
Requirement already satisfied: dash in c:\users\lr999\anaconda3\lib\site-packages (2.0.0) Requirement already satisfied: dash-table==5.0.0 in c:\users\lr999\anaconda3\lib\site-packages (from dash) (5.0.0) Requirement already satisfied: flask-compress in c:\users\lr999\anaconda3\lib\site-packages (from dash) (1.10.1) Requirement already satisfied: Flask>=1.0.4 in c:\users\lr999\anaconda3\lib\site-packages (from dash) (1.1.2) Requirement already satisfied: dash-core-components==2.0.0 in c:\users\lr999\anaconda3\lib\site-packages (from dash) (2.0.0) Requirement already satisfied: dash-html-components==2.0.0 in c:\users\lr999\anaconda3\lib\site-packages (from dash) (2.0.0) Requirement already satisfied: plotly>=5.0.0 in c:\users\lr999\anaconda3\lib\site-packages (from dash) (5.4.0) Requirement already satisfied: brotli in c:\users\lr999\anaconda3\lib\site-packages (from flask-compress->dash) (1.0.9) Requirement already satisfied: Jinja2>=2.10.1 in c:\users\lr999\anaconda3\lib\site-packages (from Flask>=1.0.4->dash) (2.11.2) Requirement already satisfied: click>=5.1 in c:\users\lr999\anaconda3\lib\site-packages (from Flask>=1.0.4->dash) (7.1.2) Requirement already satisfied: Werkzeug>=0.15 in c:\users\lr999\anaconda3\lib\site-packages (from Flask>=1.0.4->dash) (1.0.1) Requirement already satisfied: itsdangerous>=0.24 in c:\users\lr999\anaconda3\lib\site-packages (from Flask>=1.0.4->dash) (1.1.0) Requirement already satisfied: tenacity>=6.2.0 in c:\users\lr999\anaconda3\lib\site-packages (from plotly>=5.0.0->dash) (8.0.1) Requirement already satisfied: six in c:\users\lr999\anaconda3\lib\site-packages (from plotly>=5.0.0->dash) (1.15.0) Requirement already satisfied: MarkupSafe>=0.23 in c:\users\lr999\anaconda3\lib\site-packages (from Jinja2>=2.10.1->Flask>=1.0.4->dash) (1.1.1)
import dash
# dash_core_components / dash_html_components are deprecated packages;
# the same modules now ship inside dash itself (this cell's own
# UserWarnings state the exact replacement). Aliases kept so downstream
# `dcc.` / `html.` usage is unchanged.
from dash import dcc
from dash import html
from dash.dependencies import Input, Output
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
# Drop the 0/1 dummy features before PCA -- presumably because binary
# columns distort the variance decomposition (TODO confirm intent).
df_nobin = df.drop(['PROTOCOL', 'PROTOCOL_MAP'], axis=1) #dropping binary features for pca
# we divide the dataset
X = df_nobin.loc[:, ~df_nobin.columns.isin(['LABEL'])]
y = df_nobin['LABEL']
X_train, X_test, y_train_pca, y_test_pca = train_test_split(X, y, test_size=0.3, shuffle= True, stratify=y, random_state=42)
# initialize a full PCA (all components) just to inspect explained variance
pca = PCA()
pca.fit(X_train)
# Cumulative explained-variance curve: pick the smallest number of
# components that covers enough variance.
exp_var_cumul = np.cumsum(pca.explained_variance_ratio_)
px.area(
    x=range(1, exp_var_cumul.shape[0] + 1),
    y=exp_var_cumul,
    labels={"x": "# Components", "y": "Explained Variance"}
)
<ipython-input-49-109eb79bffc6>:2: UserWarning: The dash_core_components package is deprecated. Please replace `import dash_core_components as dcc` with `from dash import dcc` import dash_core_components as dcc <ipython-input-49-109eb79bffc6>:3: UserWarning: The dash_html_components package is deprecated. Please replace `import dash_html_components as html` with `from dash import html` import dash_html_components as html
# We need only the calculated resulting component scores for the elements
# in our data set.
pca = PCA(n_components=5)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)
# Name the score columns Component_1 .. Component_5.
comp_name = ['Component_%d' % i for i in range(1, 6)]
# FIX: the original wrapped X_train_pca in a DataFrame with a fresh 0..n-1
# index and concatenated it with df.reset_index(drop=True); because the
# train split is shuffled, each PCA score row was attached to the WRONG
# record. Indexing the score frame by X_train.index lets concat align each
# score with its source row; rows without scores (the test split) are
# still removed by dropna below.
df_pca_plot = pd.concat(
    [df,
     pd.DataFrame(X_train_pca, index=X_train.index, columns=comp_name)],
    axis=1)
df_pca_plot.dropna(inplace=True)
df_pca_plot
| FLOW_DURATION_MICROSECONDS | FLOW_DURATION_MILLISECONDS | FLOW_END_MILLISECONDS | IPV4_SRC_ADDR | L4_DST_PORT | L4_SRC_PORT | PROTOCOL | PROTOCOL_MAP | TCP_FLAGS | TCP_WIN_MAX_OUT | ... | TCP_WIN_SCALE_IN | SRC_TOS | DST_TOS | L7_PROTO_NAME | LABEL | Component_1 | Component_2 | Component_3 | Component_4 | Component_5 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.000343 | 0.000000 | 0.000000 | 0.000005 | 0.000055 | 0.000002 | 1.0 | 1.0 | 0.098655 | 0.000000 | ... | 0.0 | 0.0 | 0.0 | 1.000000 | 0.0 | -0.843740 | 0.272646 | 0.140359 | 0.133773 | 0.082753 |
| 1 | 0.001128 | 0.001013 | 0.000012 | 0.023976 | 0.446704 | 0.000020 | 1.0 | 1.0 | 0.125561 | 0.000000 | ... | 0.0 | 0.0 | 0.0 | 0.059846 | 0.0 | -0.679828 | -0.314779 | 0.061507 | -0.003077 | 0.002544 |
| 2 | 0.001043 | 0.001013 | 0.000015 | 0.023976 | 0.446704 | 0.000053 | 1.0 | 1.0 | 0.125561 | 0.000000 | ... | 0.0 | 0.0 | 0.0 | 0.059846 | 0.0 | 1.135528 | -0.191219 | 0.243563 | 0.021250 | 0.016054 |
| 3 | 0.000915 | 0.000000 | 0.000028 | 0.019814 | 1.000000 | 0.000012 | 0.0 | 0.0 | 0.000000 | 0.000000 | ... | 0.0 | 0.0 | 0.0 | 0.005096 | 0.0 | -0.843715 | 0.362121 | 0.119102 | 0.115664 | 0.065566 |
| 4 | 0.033816 | 0.033435 | 0.000038 | 0.010169 | 0.019104 | 0.000220 | 1.0 | 1.0 | 0.071749 | 0.004044 | ... | 0.0 | 0.0 | 0.0 | 0.002844 | 0.0 | -0.679750 | -0.056309 | 0.000082 | -0.055380 | -0.047113 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2265364 | 0.000738 | 0.000000 | 0.960829 | 1.000000 | 0.000055 | 0.200034 | 1.0 | 1.0 | 0.098655 | 0.000000 | ... | 0.0 | 0.0 | 0.0 | 1.000000 | 1.0 | -0.843763 | 0.244310 | 0.147103 | 0.139480 | 0.088203 |
| 2265365 | 0.001430 | 0.001013 | 0.960829 | 1.000000 | 0.000055 | 0.200034 | 1.0 | 1.0 | 0.098655 | 0.000000 | ... | 0.0 | 0.0 | 0.0 | 1.000000 | 1.0 | -0.597852 | -0.524587 | 0.002154 | -0.088439 | -0.053666 |
| 2265366 | 0.000697 | 0.000000 | 0.960830 | 1.000000 | 0.000052 | 0.200034 | 1.0 | 1.0 | 0.098655 | 0.000000 | ... | 0.0 | 0.0 | 0.0 | 1.000000 | 1.0 | -0.843729 | 0.399913 | 0.110135 | 0.107980 | 0.058315 |
| 2265367 | 0.000813 | 0.000000 | 0.960830 | 1.000000 | 0.000052 | 0.200034 | 1.0 | 1.0 | 0.098655 | 0.000000 | ... | 0.0 | 0.0 | 0.0 | 1.000000 | 1.0 | 1.106159 | 0.143752 | 0.146192 | -0.094773 | 0.006734 |
| 2265368 | 0.000922 | 0.000000 | 0.960831 | 1.000000 | 0.000052 | 0.200034 | 1.0 | 1.0 | 0.098655 | 0.000000 | ... | 0.0 | 0.0 | 0.0 | 1.000000 | 1.0 | -0.843671 | 0.293657 | 0.135322 | 0.129625 | 0.078691 |
2265369 rows × 22 columns
Converting the data frame into lists The algorithm in the apyori package is implemented in such a way that the input to the algorithm is a list of lists rather than a data frame. So we need to convert the data into a list of lists.
Fuzzy c-means (FCM) is a method of clustering which allows one piece of data to belong to two or more clusters. The clustering process is not so far from hard clustering performed by KMeans, works with centroids and recalculate them until convergence.
import skfuzzy as fuzz

def num_cluster_fuzzy (alldata, min_num_cluster,max_num_cluster):
    """Plot the fuzzy partition coefficient (FPC) over a range of cluster counts.

    Runs fuzzy c-means once for every cluster count in
    [min_num_cluster, max_num_cluster) -- note the upper bound is EXCLUSIVE --
    and plots FPC vs. number of centers so a good k can be read off visually.

    NOTE(review): skfuzzy's cmeans expects data shaped (n_features, n_samples);
    this function is called below with a row-per-sample DataFrame -- confirm
    the orientation is intended.
    """
    fpcs=[]
    for ncenters in range(min_num_cluster,max_num_cluster):
        # fuzziness m=2; stop when improvement < 0.005 or after 1000 iterations
        cntr, u, u0, d, jm, p, fpc = fuzz.cluster.cmeans(alldata, ncenters, 2, error=0.005, maxiter=1000, seed=42)
        # Store fpc values for later
        fpcs.append(fpc)
    fig, ax = plt.subplots()
    ax.plot(np.r_[min_num_cluster:max_num_cluster], fpcs)
    ax.set_xlabel("Number of centers")
    ax.set_ylabel("Fuzzy partition coefficient")
# decide the number of clusters
num_cluster_fuzzy(df_pca_plot, 1, 10)
# Fit fuzzy c-means with 2 clusters on everything except the target.
no_y = df_pca_plot.loc[:, ~df_pca_plot.columns.isin(['LABEL'])]
fd = no_y.to_numpy()
fcm = FCM(n_clusters=2)
fcm.fit(fd)
# outputs
fcm_centers = fcm.centers
fcm_labels = fcm.predict(fd)
# NOTE(review): plain assignment aliases df_pca_plot, so the new column is
# written into df_pca_plot as well -- use .copy() if that is not intended.
df_kmeans = df_pca_plot
df_kmeans['Cluster_kmeans'] = fcm_labels
# visualize the data; x/y are passed by keyword because positional use is
# deprecated in seaborn >= 0.12 (see the FutureWarning this cell emitted).
x_axis = df_kmeans['Component_1']
y_axis = df_kmeans['Component_2']
plt.figure(figsize=(10,10))
sns.scatterplot(x=x_axis, y=y_axis, hue=df_kmeans['Cluster_kmeans'], palette=['g', 'c'])
plt.title('Clusters KMEANS')
plt.show()
C:\Users\lr999\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
Let's plot some couples of features:
# Flow duration vs. max outgoing TCP window, coloured by cluster.
x_axis = df_kmeans['FLOW_DURATION_MILLISECONDS']
y_axis = df_kmeans['TCP_WIN_MAX_OUT']
plt.figure(figsize=(10,10))
# x/y by keyword: positional use is deprecated in seaborn >= 0.12.
sns.scatterplot(x=x_axis, y=y_axis, hue=df_kmeans['Cluster_kmeans'], palette=['g', 'c'])
plt.title('Clusters KMEANS')
plt.show()
C:\Users\lr999\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
# Flow duration vs. TCP flags, coloured by cluster.
x_axis = df_kmeans['FLOW_DURATION_MILLISECONDS']
y_axis = df_kmeans['TCP_FLAGS']
plt.figure(figsize=(10,10))
# x/y by keyword: positional use is deprecated in seaborn >= 0.12.
sns.scatterplot(x=x_axis, y=y_axis, hue=df_kmeans['Cluster_kmeans'], palette=['g', 'c'])
plt.title('Clusters KMEANS')
plt.show()
C:\Users\lr999\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
Let's analyze some data came out from clustering with k=2
# Split rows by the 2-cluster fuzzy-c-means assignment.
Cluster_0 = df_kmeans[df_kmeans['Cluster_kmeans']==0]
Cluster_1 = df_kmeans[df_kmeans['Cluster_kmeans']==1]
# Summary frame: max/min/mean/median for each of the two clusters, one
# column per feature (filled by dataframe_rad below).
rad0 = pd.DataFrame(index=["MAX_0", "MIN_0", "MEAN_0","MEDIAN_0","MAX_1", "MIN_1", "MEAN_1","MEDIAN_1"])
def dataframe_rad (rad, df1, df2, column):
    """Add `column` to `rad` with max/min/mean/median of df1 then df2.

    Mutates `rad` in place: the new column holds the four summary
    statistics for each of the two cluster frames, matching rad's
    MAX_0..MEDIAN_1 row order. Returns None.
    """
    # Pandas reducers replace builtin max/min, np.mean and the pure-Python
    # statistics.median of the original: identical values, but computed in
    # C (statistics.median is very slow on million-row Series) and
    # consistent with the stats_bestk / stats_bad cells below.
    rad[column] = [df1[column].max(), df1[column].min(),
                   df1[column].mean(), df1[column].median(),
                   df2[column].max(), df2[column].min(),
                   df2[column].mean(), df2[column].median()]
# Fill rad0 with per-feature summary statistics for the two clusters,
# then show the first 15 features.
for col in df_kmeans.columns:
    dataframe_rad(rad0, Cluster_0, Cluster_1, col)
display(rad0.T[:15])
| MAX_0 | MIN_0 | MEAN_0 | MEDIAN_0 | MAX_1 | MIN_1 | MEAN_1 | MEDIAN_1 | |
|---|---|---|---|---|---|---|---|---|
| FLOW_DURATION_MICROSECONDS | 0.705981 | 0.000000 | 0.005907 | 0.000849 | 0.691900 | 0.000000 | 0.006028 | 0.000848 |
| FLOW_DURATION_MILLISECONDS | 0.706180 | 0.000000 | 0.005354 | 0.000000 | 0.691996 | 0.000000 | 0.005476 | 0.000000 |
| FLOW_END_MILLISECONDS | 0.960831 | 0.000000 | 0.484295 | 0.510560 | 0.960830 | 0.000015 | 0.482507 | 0.507522 |
| IPV4_SRC_ADDR | 1.000000 | 0.000000 | 0.811073 | 1.000000 | 1.000000 | 0.000000 | 0.807559 | 1.000000 |
| L4_DST_PORT | 1.000000 | 0.000036 | 0.138701 | 0.000055 | 1.000000 | 0.000036 | 0.140927 | 0.000055 |
| L4_SRC_PORT | 1.000000 | 0.000000 | 0.556766 | 0.600056 | 1.000000 | 0.000000 | 0.552420 | 0.600056 |
| PROTOCOL | 1.000000 | 0.000000 | 0.871621 | 1.000000 | 1.000000 | 0.000000 | 0.869591 | 1.000000 |
| PROTOCOL_MAP | 1.000000 | 0.000000 | 0.871621 | 1.000000 | 1.000000 | 0.000000 | 0.869591 | 1.000000 |
| TCP_FLAGS | 0.959641 | 0.000000 | 0.086221 | 0.098655 | 0.959641 | 0.000000 | 0.086023 | 0.098655 |
| TCP_WIN_MAX_OUT | 0.601511 | 0.000000 | 0.000677 | 0.000000 | 0.735180 | 0.000000 | 0.000696 | 0.000000 |
| TCP_WIN_MIN_OUT | 0.601511 | 0.000000 | 0.000677 | 0.000000 | 0.735180 | 0.000000 | 0.000696 | 0.000000 |
| TCP_WIN_MSS_IN | 1.000000 | 0.000000 | 0.133983 | 0.162946 | 0.162946 | 0.000000 | 0.133458 | 0.162946 |
| TCP_WIN_SCALE_IN | 0.714286 | 0.000000 | 0.001658 | 0.000000 | 0.714286 | 0.000000 | 0.001681 | 0.000000 |
| SRC_TOS | 0.885375 | 0.000000 | 0.005829 | 0.000000 | 0.885375 | 0.000000 | 0.005970 | 0.000000 |
| DST_TOS | 0.948617 | 0.000000 | 0.008709 | 0.000000 | 0.948617 | 0.000000 | 0.008866 | 0.000000 |
# Protocol distribution per cluster (TCP == 1.0, UDP == 0.0).
print(Cluster_0['PROTOCOL_MAP'].value_counts())
print(Cluster_1['PROTOCOL_MAP'].value_counts())
# Malicious rows only (LABEL: bad == 1, good == 0).
df_kmeans_bad = df_kmeans[df_kmeans['LABEL']==1] #bad ==1 good == 0
#tcp == 1.0 udp == 0.0
print(df_kmeans_bad['PROTOCOL_MAP'].value_counts())
print(df_kmeans['PROTOCOL_MAP'].value_counts())
1.0 1127526 0.0 166070 Name: PROTOCOL_MAP, dtype: int64 1.0 845045 0.0 126728 Name: PROTOCOL_MAP, dtype: int64 1.0 1825639 Name: PROTOCOL_MAP, dtype: int64 1.0 1972571 0.0 292798 Name: PROTOCOL_MAP, dtype: int64
In Cluster_1, we can see that the TCP protocol is used more than in Cluster_0. As far as we saw from the raw data, the bad connections are performed only over TCP. UDP is equally distributed between the two clusters; unfortunately this is not what we would expect, since we know that UDP is used only in normal connections.
# Re-fit fuzzy c-means with the 6-cluster solution suggested by the FPC curve.
fcm_bestk = FCM(n_clusters=6)
fcm_bestk.fit(fd)
# outputs
fcm_bestk_centers = fcm_bestk.centers
fcm_bestk_labels = fcm_bestk.predict(fd)
# NOTE(review): plain assignment aliases df_pca_plot (no copy), so the new
# column is written into it as well -- confirm that is intended.
df_kmeans_bestk = df_pca_plot
df_kmeans_bestk['Cluster_kmeans'] = fcm_bestk_labels
# visualize in the first two PCA components; x/y by keyword because
# positional use is deprecated in seaborn >= 0.12.
x_axis = df_kmeans_bestk['Component_1']
y_axis = df_kmeans_bestk['Component_2']
plt.figure(figsize=(10,10))
sns.scatterplot(x=x_axis, y=y_axis, hue=df_kmeans_bestk['Cluster_kmeans'], palette=['b', 'orange', 'g', 'r', 'c', 'm'])
plt.title('Clusters KMEANS')
plt.show()
C:\Users\lr999\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
Analyze the 6 Clusters came out:
# Slice the frame into its six fuzzy-c-means clusters.
Cluster_0 = df_kmeans_bestk[df_kmeans_bestk['Cluster_kmeans']==0.0]
Cluster_1 = df_kmeans_bestk[df_kmeans_bestk['Cluster_kmeans']==1.0]
Cluster_2 = df_kmeans_bestk[df_kmeans_bestk['Cluster_kmeans']==2.0]
Cluster_3 = df_kmeans_bestk[df_kmeans_bestk['Cluster_kmeans']==3.0]
Cluster_4 = df_kmeans_bestk[df_kmeans_bestk['Cluster_kmeans']==4.0]
Cluster_5 = df_kmeans_bestk[df_kmeans_bestk['Cluster_kmeans']==5.0]
# Summarise the flow duration inside each cluster.
cols = ['FLOW_DURATION_MICROSECONDS']
stats_bestk = pd.DataFrame(['Max', 'Min', 'Mean', 'Median'])
for col in cols:
    for idx, part in enumerate((Cluster_0, Cluster_1, Cluster_2,
                                Cluster_3, Cluster_4, Cluster_5)):
        # tuple column key ('C<i>_', col), same layout as before
        stats_bestk['C%d_' % idx, col] = [part[col].max(), part[col].min(),
                                          part[col].mean(), part[col].median()]
display(stats_bestk.T)
| 0 | 1 | 2 | 3 | |
|---|---|---|---|---|
| 0 | Max | Min | Mean | Median |
| (C0_, FLOW_DURATION_MICROSECONDS) | 0.6919 | 0 | 0.0274543 | 0.014334 |
| (C1_, FLOW_DURATION_MICROSECONDS) | 0.409452 | 0 | 0.00248509 | 0.00082626 |
| (C2_, FLOW_DURATION_MICROSECONDS) | 0.403013 | 0 | 0.00195107 | 0.000832335 |
| (C3_, FLOW_DURATION_MICROSECONDS) | 0.400707 | 1.01257e-06 | 0.00231757 | 0.000836385 |
| (C4_, FLOW_DURATION_MICROSECONDS) | 0.705981 | 0 | 0.0272116 | 0.014332 |
| (C5_, FLOW_DURATION_MICROSECONDS) | 0.390175 | 0 | 0.00138823 | 0.000839423 |
We can find two categories of cluster between the 6 analyzed by kmeans. According to the duration of the connection, Cluster 3 and Cluster 5 have longer connections than other clusters. It could be interesting to find out the percentage of malicious connections in those two.
# Per-cluster class balance (LABEL: 1 == malicious, 0 == normal).
for part in (Cluster_0, Cluster_1, Cluster_2, Cluster_3, Cluster_4, Cluster_5):
    print(part['LABEL'].value_counts())
0.0 140019 Name: LABEL, dtype: int64 1.0 519104 0.0 33527 Name: LABEL, dtype: int64 1.0 511068 0.0 22649 Name: LABEL, dtype: int64 1.0 279178 0.0 17341 Name: LABEL, dtype: int64 0.0 213851 Name: LABEL, dtype: int64 1.0 516289 0.0 12343 Name: LABEL, dtype: int64
Here is an interesting result. The clusters with connections with longer duration result in not having malicious elements in them.
So we can assume that malicious connections last much less time than normal ones. This feature can be a strong discriminant for a classification problem.
# Cluster only the malicious rows to look for intrusion sub-families.
df_bad = df_pca_plot[df_pca_plot['LABEL']==1]
no_y = df_bad.loc[:, ~df_bad.columns.isin(['LABEL'])]
fd = no_y.to_numpy()
fcm_bad = FCM(n_clusters=6)
fcm_bad.fit(fd)
# outputs
fcm_bad_centers = fcm_bad.centers
fcm_bad_labels = fcm_bad.predict(fd)
# .copy() makes df_kmeans_bad an independent frame: the original aliased a
# slice of df_pca_plot, so the column assignment below raised a
# SettingWithCopyWarning and its effect on the parent frame was undefined.
df_kmeans_bad = df_bad.copy()
df_kmeans_bad['Cluster_kmeans'] = fcm_bad_labels
# visualize the data; x/y by keyword because positional use is deprecated
# in seaborn >= 0.12.
x_axis = df_kmeans_bad['Component_1']
y_axis = df_kmeans_bad['Component_2']
plt.figure(figsize=(10,10))
sns.scatterplot(x=x_axis, y=y_axis, hue=df_kmeans_bad['Cluster_kmeans'], palette=['b', 'orange', 'g', 'r', 'c', 'm'])
plt.title('Clusters KMEANS_only malicious')
plt.show()
<ipython-input-72-7c3b8a949e87>:14: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy C:\Users\lr999\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. C:\Users\lr999\anaconda3\lib\site-packages\IPython\core\pylabtools.py:132: UserWarning: Creating legend with loc="best" can be slow with large amounts of data.
# Slice the malicious-only frame into its six clusters.
Cluster_0 = df_kmeans_bad[df_kmeans_bad['Cluster_kmeans']==0.0]
Cluster_1 = df_kmeans_bad[df_kmeans_bad['Cluster_kmeans']==1.0]
Cluster_2 = df_kmeans_bad[df_kmeans_bad['Cluster_kmeans']==2.0]
Cluster_3 = df_kmeans_bad[df_kmeans_bad['Cluster_kmeans']==3.0]
Cluster_4 = df_kmeans_bad[df_kmeans_bad['Cluster_kmeans']==4.0]
Cluster_5 = df_kmeans_bad[df_kmeans_bad['Cluster_kmeans']==5.0]
# Summarise the flow duration of the malicious rows per cluster.
cols = ['FLOW_DURATION_MICROSECONDS']
stats_bad = pd.DataFrame(['Max', 'Min', 'Mean', 'Median'])
for col in cols:
    for idx, part in enumerate((Cluster_0, Cluster_1, Cluster_2,
                                Cluster_3, Cluster_4, Cluster_5)):
        # tuple column key ('C<i>_', col), same layout as before
        stats_bad['C%d_' % idx, col] = [part[col].max(), part[col].min(),
                                        part[col].mean(), part[col].median()]
display(stats_bad.T)
| 0 | 1 | 2 | 3 | |
|---|---|---|---|---|
| 0 | Max | Min | Mean | Median |
| (C0_, FLOW_DURATION_MICROSECONDS) | 0.0128728 | 0 | 0.000914491 | 0.000855624 |
| (C1_, FLOW_DURATION_MICROSECONDS) | 0.0302759 | 1.01257e-06 | 0.000873624 | 0.000811071 |
| (C2_, FLOW_DURATION_MICROSECONDS) | 0.031566 | 0 | 0.00090286 | 0.000836385 |
| (C3_, FLOW_DURATION_MICROSECONDS) | 0.0151643 | 1.01257e-06 | 0.000888273 | 0.000825247 |
| (C4_, FLOW_DURATION_MICROSECONDS) | 0.0481884 | 0 | 0.000880043 | 0.000816134 |
| (C5_, FLOW_DURATION_MICROSECONDS) | 0.0140424 | 1.01257e-06 | 0.000890004 | 0.000825247 |
As we can see, the malicious connections have a similar mean duration in every cluster analyzed. Let's see which other features can be examined to find characteristics shared among the intrusions.
# Inspect the value distribution of every feature inside cluster 0.
for feature in Cluster_0.columns:
    print(Cluster_0[feature].value_counts())
0.000842 302
0.000862 300
0.000811 287
0.000901 287
0.000914 287
...
0.006063 1
0.006231 1
0.004402 1
0.003351 1
0.006746 1
Name: FLOW_DURATION_MICROSECONDS, Length: 5001, dtype: int64
0.000000 127980
0.001013 49098
0.002026 2769
0.003040 1236
0.004053 718
0.005066 438
0.006079 260
0.007092 163
0.008105 102
0.009119 24
0.011145 13
0.010132 7
0.012158 2
Name: FLOW_DURATION_MILLISECONDS, dtype: int64
0.627986 18
0.627981 14
0.627996 13
0.627977 13
0.628018 13
..
0.695925 1
0.762917 1
0.796049 1
0.832705 1
0.749877 1
Name: FLOW_END_MILLISECONDS, Length: 136837, dtype: int64
1.0 182810
Name: IPV4_SRC_ADDR, dtype: int64
0.000052 91232
0.000055 56509
0.000058 19330
0.000062 5876
0.000049 2076
...
0.000276 1
0.000581 1
0.003960 1
0.000224 1
0.000331 1
Name: L4_DST_PORT, Length: 137, dtype: int64
1.0 182810
Name: L4_SRC_PORT, dtype: int64
1.0 182810
Name: PROTOCOL, dtype: int64
1.0 182810
Name: PROTOCOL_MAP, dtype: int64
0.098655 182810
Name: TCP_FLAGS, dtype: int64
0.0 182810
Name: TCP_WIN_MAX_OUT, dtype: int64
0.0 182810
Name: TCP_WIN_MIN_OUT, dtype: int64
0.162946 182810
Name: TCP_WIN_MSS_IN, dtype: int64
0.0 182810
Name: TCP_WIN_SCALE_IN, dtype: int64
0.0 182810
Name: SRC_TOS, dtype: int64
0.0 182810
Name: DST_TOS, dtype: int64
1.000000 182524
0.000014 43
0.000070 14
0.000048 13
0.000018 10
0.000029 9
0.000015 9
0.000432 9
0.000054 9
0.000108 9
0.000045 9
0.000032 9
0.000682 8
0.000028 7
0.000019 7
0.000034 7
0.000098 7
0.000109 6
0.000096 6
0.000016 5
0.000104 5
0.000056 5
0.000032 4
0.003000 4
0.000040 4
0.000108 4
0.000042 4
0.000022 4
0.000017 4
0.000049 3
0.000015 3
0.000029 3
0.000017 3
0.000041 3
0.000021 3
0.000270 3
0.083964 3
0.059846 3
0.000037 3
0.000018 3
0.000021 2
0.000016 2
0.000023 2
0.000140 2
0.001386 2
0.000323 2
0.000915 2
0.000091 2
0.000027 1
0.000039 1
0.001908 1
Name: L7_PROTO_NAME, dtype: int64
1.0 182810
Name: LABEL, dtype: int64
1.121628 4
1.114599 3
1.133104 3
1.130190 3
1.129742 3
..
1.110758 1
1.093196 1
1.125074 1
1.118842 1
1.100858 1
Name: Component_1, Length: 181896, dtype: int64
0.150924 4
0.384347 3
0.166469 3
0.187977 3
-0.192467 3
..
-0.071867 1
-0.274396 1
0.421047 1
-0.004288 1
0.016808 1
Name: Component_2, Length: 181896, dtype: int64
0.153966 4
0.113785 3
0.265281 3
0.215548 3
0.154997 3
..
0.083257 1
0.206750 1
0.251188 1
0.203024 1
0.207734 1
Name: Component_3, Length: 181896, dtype: int64
-0.069850 4
-0.100573 3
-0.025103 3
-0.020210 3
-0.035788 3
..
-0.120841 1
-0.089798 1
-0.103560 1
-0.142776 1
-0.123968 1
Name: Component_4, Length: 181896, dtype: int64
-0.024964 4
0.044715 3
0.011769 3
-0.080965 3
0.000889 3
..
-0.017547 1
0.031062 1
-0.069522 1
-0.058584 1
-0.034317 1
Name: Component_5, Length: 181896, dtype: int64
0 182810
Name: Cluster_kmeans, dtype: int64
The only features that can be taken into consideration are L4_SRC_PORT and L4_DST_PORT. They describe the source port and the destination port, respectively.
# Gather the six cluster frames once; `clusters` is reused by the label
# encoding step below. A list literal replaces six repeated appends, and
# enumerate replaces the manual `i = i + 1` counter.
clusters = [Cluster_0, Cluster_1, Cluster_2, Cluster_3, Cluster_4, Cluster_5]
for i, cl in enumerate(clusters):
    print('Cluster_', i)
    # destination-port distribution inside each cluster
    print(cl['L4_DST_PORT'].value_counts())
Cluster_ 0
0.000052 91232
0.000055 56509
0.000058 19330
0.000062 5876
0.000049 2076
...
0.000276 1
0.000581 1
0.003960 1
0.000224 1
0.000331 1
Name: L4_DST_PORT, Length: 137, dtype: int64
Cluster_ 1
0.000052 162930
0.000055 101750
0.000058 34611
0.000062 10555
0.000049 3684
...
0.000266 2
0.001633 2
0.000448 2
0.008524 1
0.000036 1
Name: L4_DST_PORT, Length: 141, dtype: int64
Cluster_ 2
0.000052 255826
0.000055 160326
0.000058 55007
0.000062 16642
0.000049 5851
...
0.000386 4
0.002811 4
0.000461 3
0.000678 3
0.005119 1
Name: L4_DST_PORT, Length: 142, dtype: int64
Cluster_ 3
0.000052 105773
0.000055 66081
0.000058 22689
0.000062 6923
0.000049 2530
...
0.001961 1
0.001036 1
0.004976 1
0.000344 1
0.001315 1
Name: L4_DST_PORT, Length: 140, dtype: int64
Cluster_ 4
0.000052 257418
0.000055 161233
0.000058 54804
0.000062 16565
0.000049 5845
...
0.001961 4
0.000308 4
0.020263 4
0.000302 4
0.005119 1
Name: L4_DST_PORT, Length: 142, dtype: int64
Cluster_ 5
0.000052 32379
0.000055 20399
0.000058 7091
0.000062 2113
0.000049 741
...
0.001633 1
0.000266 1
0.000786 1
0.000393 1
0.000175 1
Name: L4_DST_PORT, Length: 114, dtype: int64
It seems that in Cluster_3 there is only one Source port.
# Collect the destination-port values of every cluster in one pass so the
# encoder sees the full vocabulary before any single cluster is transformed.
# (np.concatenate replaces the chain of np.append calls, which copied the
# growing array on every step.)
values = np.concatenate([cl['L4_DST_PORT'].values for cl in clusters])

# Label-encode the ports; the integer codes make it easy to compute
# per-cluster percentages afterwards.
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
le.fit(values)
for cl in clusters:
    # .loc assignment avoids the pandas SettingWithCopyWarning that plain
    # item assignment on a DataFrame slice raised in the original run.
    cl.loc[:, 'L4_DST_PORT'] = le.transform(cl['L4_DST_PORT'])
<ipython-input-81-fc5dc13c3f6c>:7: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
After Label encoding the most common ports are 5,6,7,8 Let's check the percentage of those ports in the different clusters.
# Percentage of the four most frequent encoded destination ports per cluster.
for idx, cluster_df in enumerate(clusters):
    print('Cluster_', idx)
    for port in (5, 6, 7, 8):
        print(len(cluster_df[cluster_df['L4_DST_PORT'] == port]) / cluster_df.shape[0] * 100)
Cluster_ 0 0.0 0.0 0.0 0.0 Cluster_ 1 1.1222879564245196 49.634738528840124 30.996959708521953 10.543840515691926 Cluster_ 2 1.1332800040287514 49.550929808692416 31.053537844114437 10.6543040816287 Cluster_ 3 1.1850450129746035 49.5437810898667 30.952157906076987 10.62746494046671 Cluster_ 4 1.1259786092960178 49.58890703982246 31.05986468992726 10.557422019479718 Cluster_ 5 1.1281286157968455 49.2951099202241 31.056269411119906 10.795627550088302
The Destination port is not a good discriminant between different clusters.
Nevertheless, PCA proves its utility in finding components that are useful for clustering.
# Fit the encoder on the union of all source ports, then encode each cluster.
# np.concatenate builds the combined array in one allocation instead of the
# original chain of np.append calls (one full copy per call).
values = np.concatenate([cl['L4_SRC_PORT'].values for cl in clusters])
le.fit(values)
for cl in clusters:
    # .loc assignment sidesteps the SettingWithCopyWarning triggered by
    # assigning into a DataFrame slice (seen in the original output).
    cl.loc[:, 'L4_SRC_PORT'] = le.transform(cl['L4_SRC_PORT'])
<ipython-input-83-5cf7e1fb32f9>:11: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# Source-port distribution of every cluster after label encoding.
for idx, cluster_df in enumerate(clusters):
    print('Cluster_', idx)
    print(cluster_df['L4_SRC_PORT'].value_counts())
Cluster_ 0
65 182810
Name: L4_SRC_PORT, dtype: int64
Cluster_ 1
64 110291
63 109804
62 73566
61 33463
60 188
57 111
59 93
25 63
55 56
28 55
21 40
23 40
29 39
22 36
27 34
33 34
26 32
20 29
19 27
50 21
24 20
31 20
18 20
30 16
35 14
32 14
36 12
34 12
37 10
38 10
40 10
41 10
43 8
17 6
13 5
14 5
15 5
42 4
53 4
16 3
39 3
52 3
48 3
12 2
45 2
58 2
44 2
46 2
0 2
49 1
47 1
10 1
8 1
7 1
6 1
5 1
Name: L4_SRC_PORT, dtype: int64
Cluster_ 2
65 300287
63 82657
64 79293
61 29046
62 24461
60 145
57 43
28 26
27 25
29 22
33 21
19 21
23 20
25 20
24 19
50 17
20 16
37 12
34 12
22 12
21 11
26 10
30 8
31 8
18 8
44 7
35 7
32 6
40 6
36 6
38 5
15 4
39 3
43 3
45 3
58 3
53 2
52 2
46 2
48 1
49 1
41 1
47 1
56 1
59 1
17 1
16 1
42 1
14 1
Name: L4_SRC_PORT, dtype: int64
Cluster_ 3
65 76736
64 46022
63 45586
62 30841
61 13833
60 83
57 47
59 44
55 25
25 25
28 20
29 18
19 16
26 16
24 15
33 15
23 14
34 12
22 11
20 10
21 9
35 9
50 8
27 8
17 6
32 6
15 6
31 5
30 5
36 5
39 5
37 4
38 4
40 4
53 3
44 3
14 3
45 2
7 1
1 1
16 1
48 1
18 1
58 1
47 1
43 1
42 1
0 1
Name: L4_SRC_PORT, dtype: int64
Cluster_ 4
64 142991
63 140595
62 123606
65 71358
61 38758
...
54 1
5 1
51 1
2 1
0 1
Name: L4_SRC_PORT, Length: 62, dtype: int64
Cluster_ 5
65 23516
64 14265
63 14214
62 9451
61 4078
60 27
57 14
29 13
55 13
20 9
59 8
23 7
25 7
22 6
34 6
19 5
26 5
28 5
27 4
33 3
36 3
21 3
15 2
35 2
24 2
40 2
43 2
13 1
9 1
7 1
58 1
10 1
50 1
47 1
31 1
38 1
39 1
44 1
32 1
Name: L4_SRC_PORT, dtype: int64
# Share (in %) of the most frequent encoded source ports within each cluster.
for idx, cluster_df in enumerate(clusters):
    print('Cluster_', idx)
    for port in (63, 64, 65, 66, 62):
        print(len(cluster_df[cluster_df['L4_SRC_PORT'] == port]) / cluster_df.shape[0] * 100)
Cluster_ 0 0.0 0.0 100.0 0.0 0.0 Cluster_ 1 33.45051758068349 33.598876493489875 0.0 0.0 22.411030348079862 Cluster_ 2 16.009831702786617 15.358258649709756 58.1625794855207 0.0 4.737850312518764 Cluster_ 3 21.352356506506037 21.55657770241787 35.94293048048189 0.0 14.445839227331916 Cluster_ 4 27.084168105042533 27.545732647022565 13.746378375046234 0.0 23.81141351251387 Cluster_ 5 21.639973205042324 21.717617684672067 35.80171731319652 0.0 14.388587783935206
The source port turns out to be a good discriminant.
If we look at the port encoded as 66, 100% of its occurrences are in C3, which is composed of connections coming only from that port.
C5 and C4 have almost no connections from port 66.
Perhaps a certain type of malicious connection comes from that port.
import scipy.cluster.hierarchy as shc

# Hierarchical clustering is expensive, so work on a 1% random sample of the
# training set projected through the already-configured PCA.
X_frac = pca.fit_transform(X_train.sample(frac=0.01))

plt.figure(figsize=(10, 10))
plt.title("Dendograms")
# Ward linkage; truncate the tree at level 4 to keep the plot readable.
linkage_matrix = shc.linkage(X_frac, method='ward')
dend = shc.dendrogram(linkage_matrix, truncate_mode='level', p=4)
It seems reasonable to perform a clustering with k=2, according to the dendrogram results.
from sklearn.cluster import AgglomerativeClustering
from numpy import unique
from numpy import where

# Cluster the PCA-projected sample into the two groups suggested by the
# dendrogram.  The model gets its own name (agg_model) so it is not
# clobbered by the plotting loop variable, and cluster_labels avoids
# shadowing the `clusters` list of DataFrames built earlier.
agg_model = AgglomerativeClustering(n_clusters=2, affinity='euclidean', linkage='ward')
yhat = agg_model.fit_predict(X_frac)

# One scatter series per cluster label; `plt` is used consistently
# (the original mixed `plt` and `pyplot` aliases in the same cell).
cluster_labels = unique(yhat)
for label in cluster_labels:
    # row indexes of the samples assigned to this cluster
    row_ix = where(yhat == label)
    plt.scatter(X_frac[row_ix, 0], X_frac[row_ix, 1])
plt.title('Agglomerative clustering')
plt.show()
As we can see, the results are not so far from our Fuzzy Kmeans
The Super Learner is an ensemble algorithm that evaluates several base models with k-fold cross-validation and then fits a meta-model on their out-of-fold predictions.
The procedure can be summarized as follows:
1. Select a k-fold split of the training dataset.
2. Select m base-models or model configurations.
3. For each basemodel:
a. Evaluate using k-fold cross-validation.
b. Store all out-of-fold predictions.
c. Fit the model on the full training dataset and store.
4. Fit a meta-model on the out-of-fold predictions.
5. Evaluate the model on a holdout dataset or use model to make predictions.
The final result should be no worse than the best performing model evaluated during k-fold cross-validation and has the likelihood of performing better than any single model.
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import KFold
from numpy import hstack
from numpy import vstack
from numpy import asarray

# Features are every column except the target; hold out 30% of the data
# for final validation, stratified on the label.
X = df.loc[:, ~df.columns.isin(['LABEL'])]
y = df['LABEL']
X, X_val, y, y_val = train_test_split(
    X, y, test_size=0.3, shuffle=True, stratify=y, random_state=42
)
print('Train', X.shape, y.shape, 'Test', X_val.shape, y_val.shape)
# create a list of base-models
def get_models():
    """Return the list of base classifiers used by the Super Learner."""
    return [
        LogisticRegression(solver='liblinear', random_state=42),
        DecisionTreeClassifier(random_state=42),
        GaussianNB(),
        AdaBoostClassifier(random_state=42),
        BaggingClassifier(n_estimators=10, random_state=42),
        RandomForestClassifier(n_estimators=10, random_state=42),
        ExtraTreesClassifier(n_estimators=10, random_state=42),
    ]
def get_out_of_fold_predictions(X, y, models):
    """Collect out-of-fold predicted probabilities for every base model.

    Parameters
    ----------
    X, y : pandas DataFrame / Series of training features and labels
        (indexed positionally via ``.iloc``).
    models : list of classifiers exposing ``fit`` and ``predict_proba``.

    Returns
    -------
    tuple
        ``(meta_X, meta_y)`` — the stacked probability matrix (one column
        group per model, one row block per fold) and the matching true
        labels, used to train the meta-model.
    """
    meta_X, meta_y = list(), list()
    # random_state makes the shuffled split reproducible, consistent with
    # the random_state=42 used everywhere else in this notebook.
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    for train_ix, test_ix in kfold.split(X):
        fold_yhats = list()
        train_X, test_X = X.iloc[train_ix], X.iloc[test_ix]
        train_y, test_y = y.iloc[train_ix], y.iloc[test_ix]
        meta_y.extend(test_y)
        for model in models:
            print("model")
            model.fit(train_X, train_y)
            # Out-of-fold probabilities become the meta-features.
            yhat = model.predict_proba(test_X)
            fold_yhats.append(yhat)
        # Side-by-side probabilities of all models for this fold.
        meta_X.append(hstack(fold_yhats))
    return vstack(meta_X), asarray(meta_y)
# fit all base models on the training dataset
def fit_base_models(X, y, models):
    """Fit every base model on the complete training set."""
    for estimator in models:
        estimator.fit(X, y)
def fit_meta_model(X, y):
    """Train the meta-model (logistic regression) on out-of-fold predictions."""
    meta = LogisticRegression(solver='liblinear')
    meta.fit(X, y)
    return meta
# evaluate a list of models on a dataset
def evaluate_models(X, y, models):
    """Print the hold-out accuracy (as a percentage) of each fitted model."""
    for estimator in models:
        predictions = estimator.predict(X)
        score = accuracy_score(y, predictions)
        print('%s: %.3f' % (estimator.__class__.__name__, score * 100))
# make predictions with stacked model
def super_learner_predictions(X, models, meta_model):
    """Predict with the stacked ensemble.

    Each base model contributes its class probabilities; the horizontally
    concatenated probabilities feed the meta-model, whose prediction is
    returned.
    """
    per_model = [estimator.predict_proba(X) for estimator in models]
    return meta_model.predict(hstack(per_model))
models = get_models()  # instantiate the seven base classifiers
Train (2265369, 16) (2265369,) Test (970873, 16) (970873,)
# Build the meta-model's training set from out-of-fold predictions.
meta_X, meta_y = get_out_of_fold_predictions(X, y, models)
print('Meta ', meta_X.shape, meta_y.shape)
model model model model model model model model model model model model model model model model model model model model model model model model model model model model model model model model model model model model model model model model model model model model model model model model model model model model model model model model model model model model model model model model model model model model model model Meta (2265369, 14) (2265369,)
# Refit every base model on the full training set (not just the folds).
fit_base_models(X, y, models)
# Train the logistic-regression meta-model on the out-of-fold predictions.
meta_model = fit_meta_model(meta_X, meta_y)
# Sanity-check each base model on the held-out validation set.
evaluate_models(X_val, y_val, models)
LogisticRegression: 100.000 DecisionTreeClassifier: 100.000 GaussianNB: 99.999 AdaBoostClassifier: 100.000 BaggingClassifier: 100.000 RandomForestClassifier: 100.000 ExtraTreesClassifier: 100.000
# Evaluate the stacked meta-model on the hold-out set.
yhat = super_learner_predictions(X_val, models, meta_model)
# Each metric now carries its own label; the original printed the same
# ambiguous "Super Learner:" prefix on all four lines.
print('Super Learner accuracy: %.3f' % (accuracy_score(y_val, yhat) * 100))
print('Super Learner precision: %.3f' % (precision_score(y_val, yhat) * 100))
print('Super Learner recall: %.3f' % (recall_score(y_val, yhat) * 100))
print('Super Learner f1: %.3f' % (f1_score(y_val, yhat) * 100))
Super Learner: 100.000 Super Learner: 100.000 Super Learner: 100.000 Super Learner: 100.000
The SuperLearner results are in line with the previous ones. Even the best predictor gives a 100% Accuracy, Precision, Recall and f1 score.